###################################################
# Auxiliary functions for estimating
# generalization error using 
# the LOO Bootstrap in Efron and Tibshirani 1998
###################################################

fitstats <- function(observed, predicted,baseline.pred1,baseline.pred2=NULL,obs.index){
  absErr <- abs(observed-predicted)
  sqrErr <- absErr^2
  ape <- absErr/abs(observed)
  e.obs <- observed-predicted
  eStarLM <- observed-baseline.pred1
  mraeLM  <- abs(e.obs/eStarLM)
  if(!is.null(baseline.pred2)){
    eStarLASSO <- observed-baseline.pred2
    mraeLASSO  <- abs(e.obs/eStarLASSO)
  }else{
    mraeLASSO  <- rep(NA,length(absErr))
  }
  obs <- rep(obs.index,times=5)
  return(cbind(absErr
             ,sqrErr
             ,ape
             ,mraeLM
             ,mraeLASSO
  ,"obs"=obs.index))
}

## Function to create moving-block bootstrap samples
new.Boot<-function(nsamp=350, mergeData=all.data$fDataForMerge, data=all.data$fData){
  library(data.table)
  mergeData <- mergeData[mergeData[,"uid"]%in%data[,"uid"],] 
  #block builder makes the overlapping blocks within each country
  blockBuilder<-function(y, .country.year.table=country.year.table, .block.length=block.length){
    yearVec=names(.country.year.table[y][[1]])
    country.name=names(.country.year.table[y])
    if(length(yearVec)>3){
      temp=1:length(yearVec)
      selectMat=laply(seq(1:(length(temp)-.block.length+1)), function(x) {seq(x, (x+.block.length-1))})
      return(data.frame(cbind(country.name, matrix(as.numeric(yearVec[selectMat]), nrow=dim(selectMat)[1]))))
    } else{
      out<-c(country.name, yearVec)
      out<-matrix(c(out, rep(NA, (4-length(out)))), nrow=1)
      return(data.frame(out))
    }
  }
  
  # makeSample draws one random sample of blocks and then locateds UID for training and test sets
  makeSample<-function(x, .blockData=blockData, .n.blocks=n.blocks, .mergeData=mergeData, .all.uid=all.uid){
    this.sample<-sample(1:(dim(.blockData)[1]), .n.blocks, replace=TRUE)
    train<-unlist(llply(1:length(this.sample), function(x){
      .mergeData[.mergeData$country==.blockData[this.sample,][x,1] &
                               .mergeData$year%in%blockData[this.sample,][x,2:4],
                             c("uid")]
    }))
    test<-(.all.uid[!all.uid%in%train])
    cats<-(length(table(.mergeData[paste(train),"fraud.score"])))
    return(list(x=x, train=train, test=test, cats=cats))
  }
  
  
  block.length=3 #Unfortunately, I hard coded this.  But it could be adjusted
  
  ## Make a list of election years by country 
  country.year.table<-by(mergeData$year, mergeData$country, table)
  
  blockList<-llply(1:length(country.year.table), blockBuilder, .country.year.table=country.year.table, .block.length=block.length) # A list of overlapping blocks for each country
  blockData<-data.frame(rbindlist(blockList)) #changed to a data frame
  blockData<-blockData[!c(rowSums(is.na(blockData))==3),] # removing rows with missing data
  rownames(blockData)<-1:dim(blockData)[1]

  ## the rbindlist command seems to make everything into a factor.  Undoing.
  blockData$X1<-as.character(blockData$X1)
    blockData$X2<-as.numeric(as.character(blockData$X2))
    blockData$X3<-as.numeric(as.character(blockData$X3))
    blockData$X4<-as.numeric(as.character(blockData$X4))
    rownames(mergeData)<-mergeData$uid
    all.uid<-mergeData$uid
    n.blocks<-round(nrow(data)/mean(rowSums(!is.na(blockData))-1)) 
  sampleUids<-llply(1:nsamp, makeSample, .blockData=blockData, .n.blocks=n.blocks, .mergeData=mergeData, .all.uid=all.uid)
    data  <- as.data.frame(data)
    data  <-  subset(data,select=-c(train)) #remove original distinction
  rownames(data)<-data$uid

  bootsamp<-llply(1:nsamp, function(y){ 
    train=data[sampleUids[[y]]$train,] 
    test=data[sampleUids[[y]]$test,] 
    train$train<-1 
    test$train<-0 
    as.matrix(rbind(train,test)) 
  }) 
  Nbi <- laply(bootsamp,function(x)table(factor(x[x[,"train"]==1,"uid"],levels=1:max(x[,"uid"]))))
  Nb.term  <- scale(Nbi,scale=FALSE)
  return(list(data=bootsamp, Nb=Nb.term))
}

#Wrapper for bart() function
bartWrap <- function(input,data.arg){
  set.seed(831213)
  .allData <- data.arg[[input["sampind"]]]  
  train  <- .allData[,which(colnames(.allData)=="train")]==1
  
  realY.train <- .allData[train,"fraud.score"]
  realY.test <- .allData[!train,"fraud.score"]
  train.data <- .allData[train,-grep("fraud\\.score|train",colnames(.allData))] 
  test.data  <- .allData[!train,-grep("fraud\\.score|train",colnames(.allData))]
  bart.res <- bart(
    x.train = train.data[,-grep("fraud|uid",colnames(train.data))]
    ,y.train = realY.train
    ,x.test = test.data[,-grep("fraud|uid",colnames(test.data))]
    ,k = input["k"]
    ,power = input["pow"]
    ,base = input["base"]
    ,sigdf = input["sigdf"]
    ,sigquant = input["sigquant"]
    ,nskip = input["burnin"]
    ,ndpost = input["niter"]
    ,ntree = input["ntree"]
    ,sigest=0.35
    ,keepevery = input["keepevery"]
    ,keeptrainfits=TRUE  
    ,verbose = TRUE
  )
  res  <- list(bartres=bart.res
               ,y.test=realY.test
               ,train.x=train.data
               ,test.x=test.data
               ,params= paste(input,collapse="-"))
  return(res)
}

#Loss calculation function
lossGen  <- function(bartest,lasso=FALSE,no.inf=FALSE){
  dataset  <- as.data.frame(cbind(bartest$bartres$y,bartest$train.x[,-grep("uid|NA$",colnames(bartest$train.x))]))  
  names(dataset)[1]  <- "y"
  baseline.model1 <- lm(y~fraud.democracy
                        ,data=dataset)
  baseline.preds.lm <- predict(baseline.model1,as.data.frame(bartest$test.x))  
  if(lasso){
    baseline.model2  <- cv.glmnet(bartest$train.x[,-grep("uid|NA$",colnames(bartest$train.x))]
                                                       ,bartest$bartres$y)  
    baseline.preds.lasso  <- predict(baseline.model2
                                     ,bartest$test.x[,-grep("uid|NA$",colnames(bartest$test.x))]
                                     ,s="lambda.min")
    err.test <-   fitstats(observed=bartest$y.test
                           , predicted=bartest$bartres$yhat.test.mean
                           , baseline.pred1=baseline.preds.lm
                           , baseline.pred2=baseline.preds.lasso
                           , obs.index=bartest$test.x[,"uid"])
  }else{
   err.test <-  fitstats(observed=bartest$y.test
                        , predicted=bartest$bartres$yhat.test.mean
                        , baseline.pred1=baseline.preds.lm
                        , obs.index=bartest$test.x[,"uid"])
  }
  if(no.inf==TRUE){
    err.train <-  fitstats(observed=bartest$bart.res$y
                           , predicted=bartest$bartres$yhat.train.mean
                           , baseline.pred1=predict(baseline.model1)
                           , obs.index=bartest$train.x[,"uid"])
    return(list(err.test=err.test,err.train=err.train))
  }
  return(err.test)
}

# Error estimation function
err.generator <- function(n.boot.samp
                          ,param.inputs
                          ,data
                          ,cluster=NULL
                          ,parallel=TRUE
                          ,lasso=FALSE
                         ,kfold=FALSE){
  n.obs  <- nrow(data)
  set.seed(831213)
  #Bootstrap generalization error
  if(n.boot.samp>1){
      if(kfold){
          folds  <- sample(as.numeric(cut(1:nrow(data),n.boot.samp)))
          boot.samples  <- llply(unique(folds),function(x){
              temp.data  <- data
              temp.data[,"train"]  <- as.numeric(folds!=x)
              return(temp.data)
          })
      }else{
              boot.samples.all  <- cboot.maker(n.boot.samp
                                     ,data
                                     ,as.numeric(cluster)
                                     ,FALSE)
    boot.samples  <- boot.samples.all$data
    Nb.term  <- boot.samples.all$Nb
  } 
   }else{
    boot.samples <- list(data)
}
  boot.bart  <- alply(param.inputs
                         ,1
                         ,bartWrap
                         ,data.arg=boot.samples
                         ,.parallel=parallel
                         ,.paropts=list(.export=c("fitstats"),.packages=c("plyr","glmnet","BayesTree"))
  )
  
  boot.err.dis  <- ldply(boot.bart,lossGen,lasso=lasso)
  
  if(kfold){
    return(list(error=numcolwise(mean)(boot.err.dis)
                ,ind.errors=boot.err.dis
                ,bart.res=boot.bart))
  }
  
  boot.err.ind <- ddply(boot.err.dis,c("obs"),.fun=function(x){
    Ib  <- nrow(x)
    qi.plus <- numcolwise(sum)(x)
    Ei <- numcolwise(mean)(x)
    return(data.frame(Ei=t(Ei),qi.plus=t(qi.plus),Ib=Ib,loss=names(boot.err.dis)[2:6]))
  })
  
  qe.function <- function(y){
    relevant.data  <- subset(boot.err.ind,loss==names(y))  
    
    qdotb <- sum(y[,1])/n.obs
    
    obs.in.test  <- relevant.data$obs%in%rownames(y)
    obs.in.train  <- !obs.in.test
    Eib.in.test <- (relevant.data$qi.plus[obs.in.test] - y[,1])/(relevant.data$Ib[obs.in.test]-1)
    Eib.in.train  <- relevant.data$Ei[obs.in.train]
    errb  <- sum(c(Eib.in.test,Eib.in.train),na.rm=TRUE)/n.obs
    return(c(qdotb,errb))
  }
  if(n.boot.samp>1){
    qb.eb  <- daply(boot.err.dis,"X1",function(x){  
      rownames(x) <- x$obs
      aaply(x[,2:6],2,qe.function)      
    })
  NbNumerator <- data.frame(loss=rep(colnames(qb.eb),n.obs),NbNum=c(aaply(qb.eb[,,1],2,function(x)x%*%Nb.term)),obs=rep(1:n.obs,each=5))
  sd.int  <- aaply(qb.eb[,,2],2,function(x){
    sqrt(((n.boot.samp-1)/n.boot.samp)*sum((x-mean(x))^2))
  })
  }
  
  err.hack <- daply(boot.err.ind,"loss",function(x){
    if(unique(x[,"loss"])=="sqrErr"){
      mean(x[,"Ei"])
    }else{
      median(x[,"Ei"])
    }
  })
  
  err.normal <- daply(boot.err.ind,"loss",function(x){
      mean(x[,"Ei"])
  })
  
  if(n.boot.samp>1){
  boot.err.ind  <- merge(boot.err.ind,NbNumerator,all.x=TRUE)
  Di <- ddply(boot.err.ind,c("obs"),function(x){
    (2+1/(n.obs-1))*((x$Ei-err.normal)/n.obs)+(x$NbNum/x$Ib)
  })
  err.se  <- sqrt(colSums(Di[,-1]^2)-sd.int^2)    
  }
  if(n.boot.samp>1){
  return(list(error=err.normal
              ,error.se=err.se
              ,error.hack=err.hack
              ,boot.samples =boot.samples
              ,bart.res = boot.bart
              ,ind.errors = boot.err.dis
  ))}
  else{
    return(list(error=err.normal
                ,error.hack=err.hack
                ,bart.res = boot.bart
                ,ind.errors=boot.err.dis
    ))
  }
  
}  



noinf.err.generator  <- function(param.inputs #as vector
                                 ,data
                                 ,boot.err){
  
  
  #Create no information permutation
  unique.y.tab  <- table(data[,grep("fraud.score$",colnames(data))])  
  noinf.index <- as.matrix(expand.grid(outcome=names(unique.y.tab)
                                       ,preds=c(1:nrow(data))))
  ninf.data  <- aaply(noinf.index,1
                      ,.fun=function(x)c(as.numeric(x[1])
                                         ,data[as.numeric(x[2]),-grep("fraud\\.s",colnames(data))]
                                         ,unique.y.tab[which(names(unique.y.tab)==x[1])])
  )  
  colnames(ninf.data)[c(1,ncol(ninf.data))]  <- c("fraud.score","weight")
  ninf.data[,grep("train",colnames(ninf.data))]  <- 0
  #combine with original data
  data.new  <- data
  data.new[,grep("train",colnames(data.new))]  <- 1
  data.new <- cbind(data.new[,grep("fraud.score$",colnames(data.new))]
                    ,data.new[,-grep("fraud.score$",colnames(data.new))]
                    ,weight=rep(1,nrow(data.new)))
  
  data.new <- rbind(ninf.data,data.new)
  
  
  tr.ninf.bart  <- bartTest(param.inputs
                            ,list(data.new[,-grep("weight",colnames(data.new))])
                            ,no.inf=TRUE)
  
  #In-sample error
  train.err  <- daply(tr.ninf.bart$err.train,"loss"
                      ,.fun=function(x)colMeans(x[,-grep("loss|obs",colnames(x))])
  )
  #No information error
  gamma.hat  <- daply(tr.ninf.bart$err.test
                      ,"loss"
                      ,.fun=function(x){
                        as.matrix(t(x[,-grep("loss|obs",colnames(x))]))%*%ninf.data[,"weight"]/(nrow(data)^2)
                      }
  )
  
  R.hat <- (boot.err-train.err)/(gamma.hat-train.err)
  w.hat  <- 0.632/(1-0.368*R.hat)
  
  plus.err  <- (1-w.hat)*train.err+w.hat*boot.err
  
  plus.err.summ <- aaply(plus.err,1,quantile,probs=c(0.05,0.5,0.95))
  return(list(plus.err=plus.err,plus.err.sum=plus.err.summ,weight=w.hat,train=train.err,R=R.hat,gamma=gamma.hat))  
}

